* Reset the session: close any open log, empty memory, switch off output paging
* (saved as a permanent preference) and fix the random-number seed
capture log close
clear all
set more off, permanently
set seed 1234

///////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////// Table O.4: Descriptive Statistics ///////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////////////

** Load the Phase 2 dataset (long format, per the file name)
use "Work Data/Gender_Phase2_long.dta",clear

*** Subject identifiers
* sub: numeric, value-labelled copy of the string variable subject
* d_sub1, d_sub2, ...: one 0/1 indicator per distinct value of subject
encode subject, gen (sub)
tab subject, gen (d_sub)
label var sub "Subject"

** Subject dummies
* NOTE(review): the renames assume subject has exactly eight values whose sorted
* order is biology, chemistry, geography, history, language, mathematics,
* physics, portuguese - confirm with "tab subject"
rename d_sub1 Biology
rename d_sub2 Chemistry
rename d_sub3 Geography
rename d_sub4 History
rename d_sub5 Language
rename d_sub6 Mathematics
rename d_sub7 Physics
rename d_sub8 Portuguese
* Labels (full variable names are spelled out so that nothing depends on
* variable abbreviation being switched on)
label var Biology "Biology"
label var Chemistry "Chemistry"
label var Geography "Geography"
label var History "History"
label var Mathematics "Mathematics"
label var Physics "Physics"
label var Portuguese "Portuguese"
label var Language "Foreign Language"

** Labeling variables
label var priority "Priority"
label var female "Women"
label var enem "ENEM scores"
label var norm_enem_w "Norm. ENEM scores"
label var norm_p1score "P1 normalized subject-specific scores"


**************************
* Descriptive Statistics *
**************************

* Number of priority subjects per applicant (inscri2 identifies the applicant)
bysort inscri2: egen n_priority = total(priority)
summarize n_priority
label variable n_priority "\# of priority subjects"

* Applicant-level indicator for each subject: 1 when the applicant has at least
* one row where that subject's dummy is 1 and priority is 1
foreach subj of varlist Biology - Portuguese {
    * row-level flag: 0 wherever priority is not ".", 1 on the matching rows
    generate prio_`subj' = 0 if priority != .
    replace prio_`subj' = 1 if `subj' == 1 & priority == 1
    * spread the flag to every row of the applicant, then discard the helper
    bysort inscri2: egen priority_`subj' = max(prio_`subj')
    drop prio_`subj'
    label variable priority_`subj' "`subj' is a priority subject"
}
summarize priority_*

summarize n_priority - priority_Portuguese

*********************************************************************************
**************** Main sample ****************************************************
*********************************************************************************

* 1) Only years before the affirmative action took place (year 2000 is dropped as well)
drop if aa_year==1
drop if year==2000
tab year

* Snapshot of the data at this point. No explicit restore is visible further
* down, in which case Stata restores the snapshot when the do-file finishes.
preserve

* 2) Drop Portuguese and Foreign Language (Phase 1 has no Portuguese or
*    Foreign Language exams; for Portuguese, Phase 1 has an essay)
 tab subject, sum(norm_p1score)
 drop if subject=="lang" | subject=="port" 
 tab subject, sum(norm_p1score)

* Attach the major cutoff by year and career choice.
* The folder name is written "Work Data", as in the other use/merge commands of
* this file, so the path also resolves on case-sensitive file systems.
merge m:1 year career_choice using "Work Data/cutoffs.dta"
label var cutoff "Major cutoff"

* Applicant-level simple average of the normalized Phase 2 scores
bys inscri2: egen mean_normalized_score_p2=mean(norm_score)
label var mean_normalized_score_p2 "Normalized P2 scores (average)"
sum norm_score mean_normalized_score_p2

* Applicant-level weighted average: priority subjects weigh 2, the others 1.
* wtmean() is not an official egen function; it needs the user-written
* _gwtmean package to be installed.
gen prio_weight=1 if priority==0
replace prio_weight=2 if priority==1
bys inscri2: egen mean_normalized_score_p2_w = wtmean( norm_score), weight(prio_weight)
label var mean_normalized_score_p2_w "Normalized P2 scores (weighted average)"

* Applicant-level simple average of the normalized Phase 1 scores
tab sub
bys inscri2: egen mean_normalized_score_p1=mean(norm_p1score)
label var mean_normalized_score_p1 "Normalized P1 scores (average)"
sum norm_p1score mean_normalized_score_p1

* Two copies of score; the collapse further down turns them into the
* within-applicant standard deviation and coefficient of variation
gen score_sd=score
gen coef_var_p2score=score

* Keep every variable label in a local macro named "l" followed by the variable
* name, so the labels can be put back after the collapse; a variable without a
* label gets its own name as the stored text
foreach v of varlist _all {
    local l`v' : variable label `v'
    if `"`l`v''"' == "" local l`v' "`v'"
}

** Within-applicant standard deviations of the normalized scores
* For each score the applicant mean is subtracted and the overall mean added
* back; the standard deviation of that variable is stored in a scalar for the
* full sample, for women and for men. The xtsum calls only display results.
local score_p2 norm_score
local score_p1 norm_p1score

foreach ph in p2 p1 {
    local y `score_`ph''

    xtset inscri2
    xtsum `y'
    bysort inscri2: egen `y'_avg_inscri2 = mean(`y')
    egen `y'_avg = mean(`y')
    generate within_`ph' = `y' - `y'_avg_inscri2 + `y'_avg
    summarize within_`ph'

    * Full sample
    xtsum `y' if `y' != .
    summarize within_`ph' if `y' != .
    scalar within_sd_`ph' = r(sd)
    display within_sd_`ph'

    * Women (female==1) first, then men (female==0)
    foreach g in female male {
        local is_f = ("`g'" == "female")
        xtsum `y' if female == `is_f'
        summarize within_`ph' if `y' != . & female == `is_f'
        scalar within_sd_`ph'_`g' = r(sd)
        display within_sd_`ph'_`g'
    }
}

** Collapse to one row per value of inscri2
* coefficient of variation and standard deviation of the two score copies,
* means of everything else (gcollapse comes from the gtools package)
gcollapse (cv) coef_var_p2score ///
          (sd) score_sd ///
          (mean) female agejun norm_enem_w n_priority priority_* cutoff ///
                 mean_normalized_score_p2 mean_normalized_score_p2_w mean_normalized_score_p1, ///
          by(inscri2)

* Discard the row whose applicant identifier is missing
drop if inscri2==.

* Put back the labels saved before the collapse (locals named "l" + variable name)
foreach v of varlist _all {
    label variable `v' "`l`v''"
}

* Labels specific to the collapsed statistics
label variable coef_var_p2score "P2 score coefficient of variation"
label variable score_sd "P2 score standard deviation"
label variable agejun "Age"

* Complement of female; used as the grouping variable in the tests below
generate male = 1 - female
tabulate male female

* Link applicants to the cleaned RAIS file; rows found only in RAIS are dropped
* NOTE(review): year is not among the variables kept by the collapse above, so
* it has to be supplied by RAIS_cleaned.dta for the loop below to run - confirm.
merge 1:1 inscri2 using "Work Data/RAIS_cleaned.dta"
drop if _merge==2
tab _merge  
gen matched=(_merge==3)
tab matched
label var matched "Match rate - Formal labor market"

* annual_wage_after7 ... annual_wage_after12: wage observed i years after the
* exam year, read from the calendar-year variable mwagetot(year+i)
forvalues i=7(1)12 {
gen year`i'=year+`i'
tab year`i', mi
gen annual_wage_after`i'=.
levelsof year`i', local(levels) 
foreach l of local levels {
replace annual_wage_after`i'=mwagetot`l' if year`i'==`l'
sum  annual_wage_after`i'
}
}

**** Average and maximum wage
* Windows are "first last" years after the exam: 10-14, 8-14, 6-14 and 7-12.
* The loop above only creates years 7 to 12, so a window is built only when
* every yearly variable it needs exists; otherwise a note is displayed and the
* window is skipped instead of stopping the do-file with "variable not found".
* Resulting names are unchanged: avg_annual_wage_712, max_annual_wage_712, etc.

foreach x in annual_wage  { 
    foreach stat in avg max {
        * avg uses the row mean, max the row maximum
        if "`stat'"=="avg" local fn rowmean
        else local fn rowmax

        foreach w in "10 14" "8 14" "6 14" "7 12" {
            local lo : word 1 of `w'
            local hi : word 2 of `w'

            * collect the yearly variables of the window and check they all exist
            local parts
            local complete 1
            forvalues k = `lo'/`hi' {
                capture confirm variable `x'_after`k', exact
                if _rc local complete 0
                else local parts `parts' `x'_after`k'
            }

            if `complete' {
                egen `stat'_`x'_`lo'`hi' = `fn'(`parts')
            }
            else {
                display as text "note: `stat'_`x'_`lo'`hi' not created (`x'_after`lo' to `x'_after`hi' are not all available)"
            }
        }

        sum `stat'_`x'*
        sum `stat'_`x'* if _merge==1
    }
}

label var avg_annual_wage_712 "Avg annual wages - 7 to 12 years after exam"
label var max_annual_wage_712 "Max annual wages - 7 to 12 years after exam"

* 1 when a wage is observed in at least one of the years 7 to 12 after the exam
gen matched_712=0
replace matched_712=1 if avg_annual_wage_712~=.
replace matched_712=1 if max_annual_wage_712~=.
tab matched matched_712,mi
label var matched_712 "Match rate - RAIS 7 to 12 years after exam"

estimates clear

* Panel 1: applicant characteristics. Columns are full sample, women, men and
* a t-test grouped by male; female itself is summarized in the first column only.
* This panel creates the .tex file (replace); later panels append to it.
local chars agejun norm_enem_w n_priority cutoff

estpost summarize female `chars'
eststo full_sample
estpost summarize `chars' if female==1
eststo sum_female
estpost summarize `chars' if female==0
eststo sum_male
estpost ttest `chars' , by(male)
eststo ttest

esttab full_sample sum_female sum_male ttest using "Output/Descriptive_table.tex", ///
    nolines noobs nonumbers nomtitle  nogap  ///
    cells( (mean(fmt(2) pattern (1 1 1 0)) b(star fmt(2) pattern (0 0 0 1))) (sd(par  pattern (1 1 1 0) fmt(2))))  ///
    replace collabels(none) ///
    mgroups("Full sample" "Women" "Men" "Difference" , pattern(1 1 1 1)  prefix(\multicolumn{@span}{c}{) suffix(}) span erepeat(\cmidrule(lr){@span})) ///
    booktabs label f star(* 0.10 ** 0.05 *** 0.01)

* Panel 2: priority-subject indicators (binary), compared across gender with a
* test of proportions grouped by male; appended to the table file
local prio priority_Biology priority_Chemistry priority_Geography priority_History  priority_Mathematics priority_Physics priority_Portuguese

estpost summarize `prio'
eststo full_sample
estpost summarize `prio' if female==1
eststo sum_female
estpost summarize `prio' if female==0
eststo sum_male
estpost prtest `prio', by(male)
eststo prtest

esttab full_sample sum_female sum_male prtest using "Output/Descriptive_table.tex", ///
    cells( (mean(fmt(2) pattern (1 1 1 0)) b(star fmt(2) pattern (0 0 0 1))) (sd(par  pattern (1 1 1 0) fmt(2)))) ///
    nonumbers nomtitle nolines nogap  append  ///
    collabels(none) label f noobs star(* 0.10 ** 0.05 *** 0.01) booktabs

* Panel 3: exam-score measures (continuous), t-test grouped by male.
* The within-applicant standard deviations computed earlier are attached to
* the three summary columns as scalars; appended to the table file.
local scores mean_normalized_score_p1 mean_normalized_score_p2 mean_normalized_score_p2_w score_sd coef_var_p2score

estpost summarize `scores'
eststo full_sample, addscalars(within_sd_p2 within_sd_p2 within_sd_p1 within_sd_p1 )
estpost summarize `scores' if female==1
eststo sum_female, addscalars(within_sd_p2 within_sd_p2_female within_sd_p1 within_sd_p1_female )
estpost summarize `scores' if female==0
eststo sum_male, addscalars(within_sd_p2 within_sd_p2_male within_sd_p1 within_sd_p1_male )
estpost ttest `scores', by(male)
eststo ttest

esttab full_sample sum_female sum_male ttest using "Output/Descriptive_table.tex", ///
    cells( (mean(fmt(2) pattern (1 1 1 0)) b(star fmt(2) pattern (0 0 0 1))) (sd(par  pattern (1 1 1 0) fmt(2)))) ///
    nonumbers nomtitle nolines nogap  append  ///
    collabels(none) label f noobs star(* 0.10 ** 0.05 *** 0.01) booktabs

 * Panel 4: wage outcomes 7 to 12 years after the exam (continuous), t-test
 * grouped by male; printed with the %9.0fc format (no decimals, comma
 * separators) and appended to the table file
local wages avg_annual_wage_712 max_annual_wage_712

estpost summarize `wages'
eststo full_sample
estpost summarize `wages' if female==1
eststo sum_female
estpost summarize `wages' if female==0
eststo sum_male
estpost ttest `wages', by(male)
eststo ttest

esttab full_sample sum_female sum_male ttest using "Output/Descriptive_table.tex", ///
    cells( (mean(fmt("%9.0fc") pattern (1 1 1 0)) b(star fmt("%9.0fc") pattern (0 0 0 1))) (sd(par  pattern (1 1 1 0) fmt("%9.0fc")))) ///
    nonumbers nomtitle nolines nogap  append  ///
    collabels(none) label f noobs star(* 0.10 ** 0.05 *** 0.01) booktabs

* Panel 5: RAIS match rate (binary), test of proportions grouped by male.
* This last panel also prints the table footer: the within-applicant standard
* deviations attached as scalars and the number of applicants.
local outcome matched_712

estpost summarize `outcome'
eststo full_sample, addscalars(within_sd_p2 within_sd_p2 within_sd_p1 within_sd_p1 )
estpost summarize `outcome' if female==1
eststo sum_female, addscalars(within_sd_p2 within_sd_p2_female within_sd_p1 within_sd_p1_female )
estpost summarize `outcome' if female==0
eststo sum_male, addscalars(within_sd_p2 within_sd_p2_male within_sd_p1 within_sd_p1_male )
estpost prtest `outcome', by(male)
eststo prtest
* Replace N by a blank in the prtest results
* (presumably so the difference column shows no applicant count - confirm)
estadd local N " ", replace

* "new" and "sep" in stats() are not defined anywhere in this file; they appear
* to serve as empty spacer rows in the footer
esttab full_sample sum_female sum_male prtest using "Output/Descriptive_table.tex", ///
    cells( (mean(fmt(2) pattern (1 1 1 0)) b(star fmt(2) pattern (0 0 0 1))) (sd(par  pattern (1 1 1 0) fmt(2)))) ///
    nonumbers nomtitle nolines nogap  append  collabels(none) booktabs label f ///
    stats(new within_sd_p2 within_sd_p1 sep N, fmt(%1s %3.2fc %3.2fc %1s %9.0fc)  ///
    labels (" " "\textbf{Within-applicant stand.dev - Norm. P2 scores}" "\textbf{Within-applicant stand.dev - Norm. P1 scores}" " " "\textbf{\# Applicants}")) ///
    star(* 0.10 ** 0.05 *** 0.01)
